# Import train.csv, test.csv and store.csv
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)
store <- read.csv("store.csv", stringsAsFactors = F)
| No. of Rows | No. of Columns | |
|---|---|---|
| Train | 1017209 | 9 |
| Test | 41088 | 8 |
| Store | 1115 | 10 |
| No. of NAs | |
|---|---|
| Store | 0 |
| DayOfWeek | 0 |
| Date | 0 |
| Sales | 0 |
| Customers | 0 |
| Open | 0 |
| Promo | 0 |
| StateHoliday | 0 |
| SchoolHoliday | 0 |
| No. of NAs | |
|---|---|
| Id | 0 |
| Store | 0 |
| DayOfWeek | 0 |
| Date | 0 |
| Open | 11 |
| Promo | 0 |
| StateHoliday | 0 |
| SchoolHoliday | 0 |
Observations:
Hence, we will impute 1 into the NA values for the ‘Open’ variable in the test dataset.
| No. of NAs | |
|---|---|
| Store | 0 |
| StoreType | 0 |
| Assortment | 0 |
| CompetitionDistance | 3 |
| CompetitionOpenSinceMonth | 354 |
| CompetitionOpenSinceYear | 354 |
| Promo2 | 0 |
| Promo2SinceWeek | 544 |
| Promo2SinceYear | 544 |
| PromoInterval | 0 |
# a. Retrieve records with Open = NA
test %>% filter(is.na(Open)) %>% html_df()
| Id | Store | DayOfWeek | Date | Open | Promo | StateHoliday | SchoolHoliday |
|---|---|---|---|---|---|---|---|
| 480 | 622 | 4 | 2015-09-17 | NA | 1 | 0 | 0 |
| 1336 | 622 | 3 | 2015-09-16 | NA | 1 | 0 | 0 |
| 2192 | 622 | 2 | 2015-09-15 | NA | 1 | 0 | 0 |
| 3048 | 622 | 1 | 2015-09-14 | NA | 1 | 0 | 0 |
| 4760 | 622 | 6 | 2015-09-12 | NA | 0 | 0 | 0 |
| 5616 | 622 | 5 | 2015-09-11 | NA | 0 | 0 | 0 |
| 6472 | 622 | 4 | 2015-09-10 | NA | 0 | 0 | 0 |
| 7328 | 622 | 3 | 2015-09-09 | NA | 0 | 0 | 0 |
| 8184 | 622 | 2 | 2015-09-08 | NA | 0 | 0 | 0 |
| 9040 | 622 | 1 | 2015-09-07 | NA | 0 | 0 | 0 |
| 10752 | 622 | 6 | 2015-09-05 | NA | 0 | 0 | 0 |
# b. Replace NA with Open = 1
test <- test %>% mutate(Open = replace(Open, is.na(Open),1))
# c. Check if NA has been replaced:
sum(is.na(test$Open))
## [1] 0
#a. Train
train <- train %>% mutate(
DayOfWeek = as.factor(DayOfWeek),
Date = as.Date(Date),
Open = as.factor(Open),
Promo = as.factor(Promo),
StateHoliday = as.factor(StateHoliday), # Has 4 values!
SchoolHoliday = as.factor(SchoolHoliday))
#Day = as.integer(format(train$Date, "%d")), # New variable 1
#Month = as.integer(format(train$Date, "%m")), # New variable 2
#Year = as.integer(format(train$Date, "%Y"))) # New variable 3
str(train)
## 'data.frame': 1017209 obs. of 9 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ Date : Date, format: "2015-07-31" "2015-07-31" ...
## $ Sales : int 5263 6064 8314 13995 4822 5651 15344 8492 8565 7185 ...
## $ Customers : int 555 625 821 1498 559 589 1414 833 687 681 ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
#b. Test
test <- test %>% mutate(
DayOfWeek = as.factor(DayOfWeek),
Date = as.Date(Date),
Open = as.factor(Open),
Promo = as.factor(Promo),
StateHoliday = as.factor(StateHoliday), # Only 2 values! What're the state holidays?
SchoolHoliday = as.factor(SchoolHoliday))
#Day = as.integer(format(test$Date, "%d")), # New variable 1
#Month = as.integer(format(test$Date, "%m")), # New variable 2
#Year = as.integer(format(test$Date, "%Y"))) # New variable 3
str(test)
## 'data.frame': 41088 obs. of 8 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Store : int 1 3 7 8 9 10 11 12 13 14 ...
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ Date : Date, format: "2015-09-17" "2015-09-17" ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ StateHoliday : Factor w/ 2 levels "0","a": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
#c. Store
store <- store %>% mutate(
StoreType = as.factor(StoreType),
Assortment = as.factor(Assortment),
Promo2 = as.factor(Promo2),
PromoInterval = as.factor(PromoInterval))
str(store)
## 'data.frame': 1115 obs. of 10 variables:
## $ Store : int 1 2 3 4 5 6 7 8 9 10 ...
## $ StoreType : Factor w/ 4 levels "a","b","c","d": 3 1 1 3 1 1 1 1 1 1 ...
## $ Assortment : Factor w/ 3 levels "a","b","c": 1 1 1 3 1 1 3 1 3 1 ...
## $ CompetitionDistance : int 1270 570 14130 620 29910 310 24000 7520 2030 3160 ...
## $ CompetitionOpenSinceMonth: int 9 11 12 9 4 12 4 10 8 9 ...
## $ CompetitionOpenSinceYear : int 2008 2007 2006 2009 2015 2013 2013 2014 2000 2009 ...
## $ Promo2 : Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 1 1 ...
## $ Promo2SinceWeek : int NA 13 14 NA NA NA NA NA NA NA ...
## $ Promo2SinceYear : int NA 2010 2011 NA NA NA NA NA NA NA ...
## $ PromoInterval : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 3 3 1 1 1 1 1 1 1 ...
train.store <- merge(train, store, by = "Store")
Sales data start from 2013-01-01 to 2015-07-31, which spans a total of 941 days or 2 years 7 months.
plot(train$Date, type = "l")
plot(test$Date, type = "l")
No () visible breaks in data, hence no missing data by date.
Sunday has the least sales for all opened stores over the data period, and that could be because most stores are closed on Sundays.
# Check if closed stores have any sales. Result = no anomalies.
train.closed <- train[train$Open == 0,]
train.closed$Sales %>% sum()
## [1] 0
# First plot
ggplot(data = train, aes (x= DayOfWeek, y= Sales)) +
geom_bar(stat = "identity")
# Second plot
train %>% group_by(DayOfWeek, Open) %>% tally() %>%
ggplot(aes(x=DayOfWeek, y=n, fill = Open)) +
geom_bar(stat="identity")
# PromotionInterval
ggplot(train.store, aes(x = factor(PromoInterval), y = Sales, color = PromoInterval)) +
geom_col() +
ggtitle("Sales by PromoInterval")
# StoreType
ggplot(train.store, aes(x = Date, y = Sales, color = StoreType))+
geom_smooth(se= F, size = 1.5) +
ggtitle("Sales by StoreType")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(train.store, aes(x = Date, y = Customers, color = StoreType)) +
geom_smooth(se= F, size = 1.5) +
ggtitle("Customers by StoreType")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
# Assortment
ggplot(train.store, aes(x = Date, y = Sales, color = Assortment)) +
geom_smooth(se= F, size = 1.5) +
ggtitle("Sales by Assortment")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(train.store, aes(x = Date, y = Customers, color = Assortment)) +
geom_smooth(se= F, size = 1.5) +
ggtitle("Customers by Assortment")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## 4. CompetitionDistance, OpenSinceMonth/Year
# Combine year and month into one date variable:
store$CompetitionOpenSince <-as.yearmon(paste(store$CompetitionOpenSinceYear,
store$CompetitionOpenSinceMonth, sep = "-"))
# P.S: yearmon functon creates a numeric vector interpreted in "years" and fractions of years. e.g. 1961.5 = June 1961.
# Histogram for CompetitionOpenedSince
plot_ly(x= store$CompetitionOpenSince, type = "histogram") %>%
layout(title = "Distribution of CompetitionOpenedSince",
xaxis = list(title = "Year",
zeroline = FALSE),
yaxis = list(title = "Count",
zeroline = FALSE))
## Warning: Ignoring 354 observations
Observations: Many competitors opened recently, except 1 that opened in 1900 and 1 in 1961.
# Combine year and month into one date variable:
store$Promo2Since <- as.POSIXct(paste(store$Promo2SinceYear,
store$Promo2SinceWeek, 1, sep = "-"),
format = "%Y-%U-%u")
hist(as.numeric(as.POSIXct("2015-10-01", format = "%Y-%m-%d") - store$Promo2Since),
100, main = "Days since start of promo2")
# Histogram for Promo2Since (in days)
plot_ly(x= as.POSIXct("2015-10-01", format = "%Y-%m-%d") - store$Promo2Since, type = "histogram") %>%
layout(title = "Distribution of Promo2Since",
xaxis = list(title = "Days",
zeroline = FALSE),
yaxis = list(title = "Count",
zeroline = FALSE))
## Warning: Ignoring 544 observations
# MeanSales by CompetitionDistance
salesbydist <- train.store %>% group_by(CompetitionDistance) %>% summarise(MeanSales = mean(Sales, na.rm=TRUE))
## NOTE: Plotting without mean makes everthing too cluttered. Code below can't see shit. Followed online guide.
## ggplot(train.store, aes(x = CompetitionDistance, y = Sales)) + geom_point() + geom_smooth()
# salesbydist scatterplot
ggplot(salesbydist, aes(x = CompetitionDistance, y = MeanSales)) +
geom_point() + geom_smooth() + scale_x_log10() + scale_y_log10()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
Observations: Interestingly, stores with competition that are closer have slightly higher sales on average while those with competition that are further have slightly lower sales. Just based on this graph alone, we cannot deduce much, but a possibility is that the stores with close competitors are situated in areas with high footfall such as cities, contributing to slightly higher revenue.
train.store <- merge(train, store, by = "Store")
train.store2 <- train.store %>% dplyr:: select(
DayOfWeek, #1
Sales, #2
Customers, #3
Open, #4
Promo, #5
StateHoliday, #6
SchoolHoliday, #7
StoreType, #8
Assortment, #9
CompetitionDistance, #10
Promo2, #11
PromoInterval, #12
CompetitionOpenSince) #13
#Promo2Since) #14
str(train.store2)
## 'data.frame': 1017209 obs. of 13 variables:
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 5 6 5 3 3 7 3 1 5 1 ...
## $ Sales : int 5263 4952 4190 6454 3310 0 3591 4770 3836 3722 ...
## $ Customers : int 555 646 552 695 464 0 453 542 466 480 ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 1 1 ...
## $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 1 ...
## $ StoreType : Factor w/ 4 levels "a","b","c","d": 3 3 3 3 3 3 3 3 3 3 ...
## $ Assortment : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ CompetitionDistance : int 1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
## $ Promo2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ PromoInterval : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CompetitionOpenSince: 'yearmon' num Sep 2008 Sep 2008 Sep 2008 Sep 2008 ...
# Run lm first
train.mlm <- lm(Sales ~., data = train.store2)
str(train.store)
## 'data.frame': 1017209 obs. of 20 variables:
## $ Store : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayOfWeek : Factor w/ 7 levels "1","2","3","4",..: 5 6 5 3 3 7 3 1 5 1 ...
## $ Date : Date, format: "2015-07-31" "2013-01-12" ...
## $ Sales : int 5263 4952 4190 6454 3310 0 3591 4770 3836 3722 ...
## $ Customers : int 555 646 552 695 464 0 453 542 466 480 ...
## $ Open : Factor w/ 2 levels "0","1": 2 2 2 2 2 1 2 2 2 2 ...
## $ Promo : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 1 1 ...
## $ StateHoliday : Factor w/ 4 levels "0","a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ SchoolHoliday : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 1 1 ...
## $ StoreType : Factor w/ 4 levels "a","b","c","d": 3 3 3 3 3 3 3 3 3 3 ...
## $ Assortment : Factor w/ 3 levels "a","b","c": 1 1 1 1 1 1 1 1 1 1 ...
## $ CompetitionDistance : int 1270 1270 1270 1270 1270 1270 1270 1270 1270 1270 ...
## $ CompetitionOpenSinceMonth: int 9 9 9 9 9 9 9 9 9 9 ...
## $ CompetitionOpenSinceYear : int 2008 2008 2008 2008 2008 2008 2008 2008 2008 2008 ...
## $ Promo2 : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ Promo2SinceWeek : int NA NA NA NA NA NA NA NA NA NA ...
## $ Promo2SinceYear : int NA NA NA NA NA NA NA NA NA NA ...
## $ PromoInterval : Factor w/ 4 levels "","Feb,May,Aug,Nov",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ CompetitionOpenSince : 'yearmon' num Sep 2008 Sep 2008 Sep 2008 Sep 2008 ...
## $ Promo2Since : POSIXct, format: NA NA ...
# Ultimate step-wise regression...is useless in feature selection here...
training.swr <- step(train.mlm, direction = "both")
## Start: AIC=9815157
## Sales ~ DayOfWeek + Customers + Open + Promo + StateHoliday +
## SchoolHoliday + StoreType + Assortment + CompetitionDistance +
## Promo2 + PromoInterval + CompetitionOpenSince
##
##
## Step: AIC=9815157
## Sales ~ DayOfWeek + Customers + Open + Promo + StateHoliday +
## SchoolHoliday + StoreType + Assortment + CompetitionDistance +
## PromoInterval + CompetitionOpenSince
##
## Df Sum of Sq RSS AIC
## <none> 9.6527e+11 9815157
## - CompetitionOpenSince 1 5.6413e+08 9.6583e+11 9815561
## - SchoolHoliday 1 1.4022e+09 9.6667e+11 9816162
## - StateHoliday 3 2.1408e+09 9.6741e+11 9816688
## - Open 1 2.4120e+09 9.6768e+11 9816887
## - PromoInterval 3 1.3733e+10 9.7900e+11 9824953
## - CompetitionDistance 1 1.4777e+10 9.8004e+11 9825696
## - DayOfWeek 6 3.2704e+10 9.9797e+11 9838264
## - Assortment 2 8.7833e+10 1.0531e+12 9875581
## - StoreType 3 1.3865e+11 1.1039e+12 9908279
## - Promo 1 1.7183e+11 1.1371e+12 9928832
## - Customers 1 3.5124e+12 4.4776e+12 10879848
summary(training.swr)
##
## Call:
## lm(formula = Sales ~ DayOfWeek + Customers + Open + Promo + StateHoliday +
## SchoolHoliday + StoreType + Assortment + CompetitionDistance +
## PromoInterval + CompetitionOpenSince, data = train.store2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29606.6 -712.9 -45.0 582.8 26414.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.379e+03 4.801e+02 19.535 < 2e-16 ***
## DayOfWeek2 -4.600e+02 5.334e+00 -86.240 < 2e-16 ***
## DayOfWeek3 -6.026e+02 5.346e+00 -112.717 < 2e-16 ***
## DayOfWeek4 -6.691e+02 5.358e+00 -124.884 < 2e-16 ***
## DayOfWeek5 -5.584e+02 5.312e+00 -105.118 < 2e-16 ***
## DayOfWeek6 -1.826e+02 5.738e+00 -31.829 < 2e-16 ***
## DayOfWeek7 -3.489e+02 1.925e+01 -18.124 < 2e-16 ***
## Customers 7.514e+00 4.729e-03 1588.932 < 2e-16 ***
## Open1 8.016e+02 1.925e+01 41.639 < 2e-16 ***
## Promo1 1.208e+03 3.438e+00 351.447 < 2e-16 ***
## StateHolidaya -3.308e+02 2.070e+01 -15.977 < 2e-16 ***
## StateHolidayb -7.889e+02 2.570e+01 -30.702 < 2e-16 ***
## StateHolidayc 1.966e+02 2.932e+01 6.707 1.99e-11 ***
## SchoolHoliday1 1.247e+02 3.929e+00 31.748 < 2e-16 ***
## StoreTypeb -2.562e+03 1.659e+01 -154.466 < 2e-16 ***
## StoreTypec -1.516e+02 4.259e+00 -35.595 < 2e-16 ***
## StoreTyped 9.055e+02 3.497e+00 258.923 < 2e-16 ***
## Assortmentb -6.050e+03 2.751e+01 -219.896 < 2e-16 ***
## Assortmentc 3.670e+02 3.071e+00 119.490 < 2e-16 ***
## CompetitionDistance 1.960e-02 1.902e-04 103.061 < 2e-16 ***
## PromoIntervalFeb,May,Aug,Nov 2.170e+02 4.936e+00 43.971 < 2e-16 ***
## PromoIntervalJan,Apr,Jul,Oct 3.280e+02 3.394e+00 96.635 < 2e-16 ***
## PromoIntervalMar,Jun,Sept,Dec 2.064e+02 5.417e+00 38.094 < 2e-16 ***
## CompetitionOpenSince -4.809e+00 2.388e-01 -20.137 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1179 on 693837 degrees of freedom
## (323348 observations deleted due to missingness)
## Multiple R-squared: 0.9074, Adjusted R-squared: 0.9073
## F-statistic: 2.954e+05 on 23 and 693837 DF, p-value: < 2.2e-16